rm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
## [1] "C"
if (!require(dplyr)) install.packages("dplyr"); library(dplyr)
if (!require(ggplot2)) install.packages("ggplot2"); library(ggplot2)
if (!require(caTools)) install.packages("caTools"); library(caTools)
if (!require(d3heatmap)) install.packages("d3heatmap"); library(d3heatmap)
if (!require(qcc)) install.packages("qcc"); library(qcc)
if (!require(maps)) install.packages("maps"); library(maps)
if (!require(plotly)) install.packages("plotly"); library(plotly)
load("rdata/Z.rdata")
Ord <- left_join(Ord, Cust[,c(1,2)], by = "customer_id")
以客戶獨特的id分群產生:
- 訂單數量
- 平均購買價格
- 平均付款後到實際取貨時間
- 顧客類別(訂單數量大於1筆,為舊顧客)
EachCust <- Ord %>% group_by(customer_unique_id) %>% na.omit() %>%
summarize(buy_num=n(), buy_value=mean(order_value), delivery_days=mean(delivery_days)) %>%
mutate(cust=c("New Customer"))
EachCust$cust[EachCust$buy_num>1] = c("Old Customer")
dfCust <- EachCust %>% group_by(cust) %>% summarize(size=n()) %>%
mutate(cust = factor(cust, levels = c("New Customer", "Old Customer")),
cumulative = c(2800, 0),
midpoint = (cumulative + cumulative + size) / 2,
label = paste0(round(size / sum(size) * 100, 2), "%"))
ggplot(dfCust, aes(x = 1, weight = size, fill = cust)) +
geom_bar(width = 1, position = "stack") +
coord_polar(theta = "y") +
geom_text(aes(x = 1.3, y = midpoint, label = label)) +
theme_void() +
scale_fill_brewer(palette = "Greens")
dfCustValue <- EachCust %>% group_by(cust) %>% summarize(value=mean(buy_value))
ggplot(dfCustValue, aes(x=cust, y=value)) +
geom_col(width = 0.5)
Ord <- Ord %>% mutate(year=format(order_delivered_customer_date,'%Y'),
month=format(order_delivered_customer_date,'%m'),
weekday=format(order_delivered_customer_date,'%A'),
day=format(order_delivered_customer_date, '%m-%d'))
折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額
yearDf <- Ord %>% group_by(year) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
yearDf <- yearDf[c(-1),]
yearDf$year <- yearDf$year %>% as.numeric()
ggplot(data = yearDf) +
geom_bar(mapping = aes(y = value*max(yearDf$num)/max(yearDf$value), x = year), stat = "identity",
colour = gray(0.5), fill = gray(0.5), width = 0.5) +
geom_line(mapping = aes(y = num, x = year)) +
geom_point(mapping = aes(y = num, x = year), size = 3, shape = 21, fill = "white") +
scale_x_continuous(breaks=seq(2017, 2018, 1)) +
scale_y_continuous(name = "quantity of order", limits = c(0,max(yearDf$num)),
sec.axis = sec_axis(~. *max(yearDf$num)/max(yearDf$value), name = "sales")) +
ggtitle("Year") +
theme(plot.title = element_text(hjust = 0.5))
折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額
monthDf <- Ord %>% group_by(month) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
monthDf$month <- monthDf$month %>% as.numeric()
ggplot(data = monthDf) +
geom_bar(mapping = aes(y = value*max(monthDf$num)/max(monthDf$value), x = month), stat = "identity",
colour = gray(0.5), fill = gray(0.5)) +
geom_line(mapping = aes(y = num, x = month)) +
geom_point(mapping = aes(y = num, x = month), size = 3, shape = 21, fill = "white") +
scale_x_continuous(breaks=seq(1, 12, 1)) +
scale_y_continuous(name = "quantity of order", limits = c(0,max(monthDf$num)),
sec.axis = sec_axis(~. *max(monthDf$num)/max(monthDf$value), name = "sales")) +
ggtitle("Month") +
theme(plot.title = element_text(hjust = 0.5))
折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額
weekDf <- Ord %>% group_by(weekday) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
weekDf$weekday <- as.numeric(c(5,1,6,7,4,2,3))
weekDf <- weekDf[order(weekDf$weekday),]
ggplot(data = weekDf) +
geom_bar(mapping = aes(y = value*max(weekDf$num)/max(weekDf$value), x = weekday), stat = "identity",
colour = gray(0.5), fill = gray(0.5)) +
geom_line(mapping = aes(y = num, x = weekday)) +
geom_point(mapping = aes(y = num, x = weekday), size = 3, shape = 21, fill = "white") +
scale_x_continuous(breaks=seq(1, 7, 1)) +
scale_y_continuous(name = "quantity of order", limits = c(0,max(weekDf$num)),
sec.axis = sec_axis(~. *max(weekDf$num)/max(weekDf$value), name = "sales")) +
ggtitle("Week") +
theme(plot.title = element_text(hjust = 0.5))
折線圖的坐標軸為左邊的訂單數量
長條圖的坐標軸為右邊的營業額
此部分為對年做分群,在對月份做分群,得到整個期間每個月分的訂單數量與營業額
allDf <- Ord %>% group_by(year, month) %>% summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allDf$year <- as.numeric(allDf$year)
allDf$month <- as.numeric(allDf$month)
allDf$date <- as.Date(as.character(allDf$year*10000+allDf$month*100+01), format = "%Y%m%d")
ggplot(data = allDf) +
geom_bar(mapping = aes(y = value*max(allDf$num)/max(allDf$value), x = date), stat = "identity",
colour = gray(0.5), fill = gray(0.5)) +
geom_line(mapping = aes(y = num, x = date)) +
geom_point(mapping = aes(y = num, x = date), size = 3, shape = 21, fill = "white") +
scale_y_continuous(name = "quantity of order", limits = c(0,max(allDf$num)),
sec.axis = sec_axis(~. *max(allDf$num)/max(allDf$value), name = "sales")) +
ggtitle("2016/10-2018/10") +
theme(plot.title = element_text(hjust = 0.5))
## 合併,刪除掉重複的列
Ord <- left_join(Ord, select(OrdPay[!duplicated(OrdPay$order_id),], order_id,
payment_type, payment_installments),
by = "order_id")
## 排除掉分期為0的列
NewPay <- Ord[-c(which(Ord$payment_installments == 0)),]
NewPay <- na.omit(NewPay)
NewPay$installment <- NewPay$payment_installments
NewPay$installment[NewPay$installment>=8] <- "8 above"
ggplot(NewPay, aes(x = 1, fill = factor(installment))) +
geom_bar(width = 1, position = "stack") +
coord_polar(theta = "y") +
theme_void() +
scale_fill_brewer(palette="Pastel1") +
ggtitle("Num of Installments") +
theme(plot.title = element_text(hjust = 0.5))
installMents <- NewPay$order_item_value[NewPay$payment_installments>1]
noninstallMents <- NewPay$order_item_value[NewPay$payment_installments==1]
summary(installMents)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.49 59.98 110.00 171.77 188.99 6735.00
summary(noninstallMents)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.85 34.90 59.99 100.77 113.00 13440.00
使用t.test檢定
H0: 有分期與無分期的平均售價相等
H1: 有分期與無分期的平均售價不相等
## F檢定:兩母體變異數是否有差異。p-value <0.05,母體變異數不相同
var.test(installMents, noninstallMents)
##
## F test to compare two variances
##
## data: installMents and noninstallMents
## F = 1.7816, num df = 49276, denom df = 47180, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.750116 1.813738
## sample estimates:
## ratio of variances
## 1.781646
## T檢定。p-value <0.05,表示有分期與無分期的平均售價有顯著差異
t.test(installMents, noninstallMents, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: installMents and noninstallMents
## t = 53.833, df = 91236, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 68.41958 73.58997
## sample estimates:
## mean of x mean of y
## 171.7728 100.7680
## 先找出含有多種產品的分類
BigCats = names((Prod$product_category_name_english %>% table))[(Prod$product_category_name_english %>% table) > 300 ]
## 若屬於大分類的,type則不變
Prod$product_type = ifelse(Prod$product_category_name_english %in% BigCats,
Prod$product_category_name_english,
NA)
## 合併子類別為大類別: product_type
Prod[ grepl("furniture", Prod$product_category_name_english) , "product_type"] = "furniture"
Prod[ grepl("art", Prod$product_category_name_english) , "product_type"] = "art"
Prod[ grepl("fashio", Prod$product_category_name_english) , "product_type"] = "fashion"
Prod[ grepl("construction_tools", Prod$product_category_name_english) , "product_type"] = "construction_tools"
Prod[ grepl("costruction_tools", Prod$product_category_name_english) , "product_type"] = "construction_tools"
Prod[ grepl("home", Prod$product_category_name_english) , "product_type"] = "home"
Prod[ grepl("books", Prod$product_category_name_english) , "product_type"] = "books"
Prod[ grepl("food", Prod$product_category_name_english) , "product_type"] = "food"
Prod[ grepl("drink", Prod$product_category_name_english) , "product_type"] = "food"
## 沒被合併到的子分類全部歸類為others
Prod[is.na(Prod$product_type), "product_type"] = "others"
Prod$product_type = as.factor(Prod$product_type)
## 查看type分布
table(Prod$product_type) %>% sort(decreasing = T)
##
## furniture bed_bath_table sports_leisure
## 3271 3029 2867
## health_beauty housewares auto
## 2444 2335 1900
## computers_accessories toys others
## 1639 1411 1403
## watches_gifts fashion telephony
## 1329 1221 1134
## baby perfumery stationery
## 919 868 849
## home cool_stuff garden_tools
## 832 789 753
## pet_shop construction_tools electronics
## 719 696 517
## books luggage_accessories consoles_games
## 370 349 317
## food art
## 267 100
新增Prod欄位
Prod = OrdRev %>%
## 計算每筆訂單
group_by(order_id) %>%
summarise(avgReviewScore = mean(review_score, na.rm = T),
avgCommentLength = mean(comment_length, na.rm = T),
avgCommentAnswerDelay = mean(answer_delay, na.rm = T)) %>%
right_join(OrdItm[,c("order_id", "product_id")], by = "order_id") %>%
## 多合併delivery_days欄位
left_join(Ord[Ord$delivery_days>0 ,c("order_id", "delivery_days")], by = "order_id") %>%
## 計算每個產品
group_by(product_id) %>%
summarise(avgDeliveryDays = mean(delivery_days, na.rm = T), # 平均運送天數
avgReviewScore = mean(avgReviewScore, na.rm = T), # 平均評論分數
avgCommentLength = mean(avgCommentLength, na.rm = T), # 平均評論長度
avgCommentAnswerDelay = mean(avgCommentAnswerDelay, na.rm = T)) %>% # 平均回覆天數
right_join(Prod, by="product_id")
summary(Prod)
## product_id avgDeliveryDays avgReviewScore avgCommentLength
## Length:32328 Min. : 1.00 Min. :1.000 Min. : 0.00
## Class :character 1st Qu.: 7.00 1st Qu.:3.520 1st Qu.: 0.00
## Mode :character Median : 10.00 Median :4.500 Median : 12.67
## Mean : 11.61 Mean :4.035 Mean : 30.89
## 3rd Qu.: 14.00 3rd Qu.:5.000 3rd Qu.: 45.00
## Max. :194.00 Max. :5.000 Max. :214.00
## NA's :738
## avgCommentAnswerDelay product_category_name product_name_lenght
## Min. : 0.000 cama_mesa_banho : 3029 Min. : 5.00
## 1st Qu.: 1.000 esporte_lazer : 2867 1st Qu.:42.00
## Median : 1.500 moveis_decoracao : 2657 Median :51.00
## Mean : 2.604 beleza_saude : 2444 Mean :48.47
## 3rd Qu.: 2.900 utilidades_domesticas: 2335 3rd Qu.:57.00
## Max. :512.000 automotivo : 1900 Max. :76.00
## (Other) :17096
## product_description_lenght product_photos_qty product_weight_g
## Min. : 4.0 Min. : 1.000 Min. : 0
## 1st Qu.: 339.0 1st Qu.: 1.000 1st Qu.: 300
## Median : 595.0 Median : 1.000 Median : 700
## Mean : 771.5 Mean : 2.189 Mean : 2277
## 3rd Qu.: 972.0 3rd Qu.: 3.000 3rd Qu.: 1900
## Max. :3992.0 Max. :20.000 Max. :40425
## NA's :1
## product_length_cm product_height_cm product_width_cm
## Min. : 7.00 Min. : 2.00 Min. : 6.00
## 1st Qu.: 18.00 1st Qu.: 8.00 1st Qu.: 15.00
## Median : 25.00 Median : 13.00 Median : 20.00
## Mean : 30.86 Mean : 16.96 Mean : 23.21
## 3rd Qu.: 38.00 3rd Qu.: 20.50 3rd Qu.: 30.00
## Max. :105.00 Max. :105.00 Max. :118.00
## NA's :1 NA's :1 NA's :1
## product_category_name_english noPurchase revenue
## Length:32328 Min. : 1.000 Min. : 2.20
## Class :character 1st Qu.: 1.000 1st Qu.: 59.97
## Mode :character Median : 1.000 Median : 138.75
## Mean : 3.434 Mean : 414.71
## 3rd Qu.: 3.000 3rd Qu.: 329.90
## Max. :527.000 Max. :63885.00
##
## product_type
## furniture : 3271
## bed_bath_table: 3029
## sports_leisure: 2867
## health_beauty : 2444
## housewares : 2335
## auto : 1900
## (Other) :16482
Y = Prod %>%
group_by(product_type) %>%
summarise(noProd = n(), # 有幾個不同的產品
noPurchase = sum(noPurchase), # 總共被購買次數
revenue = sum(revenue), # 總獲利
RevPerProd = revenue/noPurchase, # 商品平均獲利
photos_qty = mean(product_photos_qty), # 平均圖片數量
avgDeliveryDays = mean(avgDeliveryDays, na.rm = T), # 平均運送天數
avgReviewScore = mean(avgReviewScore, na.rm = T), # 平均評論分數
avgCommentLength = mean(avgCommentLength, na.rm = T), # 平均評論長度
avgCommentAnswerDelay = mean(avgCommentAnswerDelay, na.rm = T)) # 平均回覆天數
summary(Y)
## product_type noProd noPurchase
## art : 1 Min. : 100.0 Min. : 276
## auto : 1 1st Qu.: 701.8 1st Qu.: 1998
## baby : 1 Median : 893.5 Median : 3608
## bed_bath_table : 1 Mean :1243.4 Mean : 4270
## books : 1 3rd Qu.:1582.0 3rd Qu.: 5630
## computers_accessories: 1 Max. :3271.0 Max. :11115
## (Other) :20
## revenue RevPerProd photos_qty avgDeliveryDays
## Min. : 30502 Min. : 57.41 Min. :1.393 Min. : 8.659
## 1st Qu.: 205243 1st Qu.: 92.14 1st Qu.:2.025 1st Qu.:10.670
## Median : 405445 Median :115.43 Median :2.280 Median :11.529
## Mean : 515638 Mean :120.56 Mean :2.241 Mean :11.293
## 3rd Qu.: 842789 3rd Qu.:137.45 3rd Qu.:2.452 3rd Qu.:11.798
## Max. :1258681 Max. :244.06 Max. :3.169 Max. :13.405
##
## avgReviewScore avgCommentLength avgCommentAnswerDelay
## Min. :3.843 Min. :21.99 Min. :1.830
## 1st Qu.:3.993 1st Qu.:28.51 1st Qu.:2.348
## Median :4.075 Median :31.28 Median :2.540
## Mean :4.078 Mean :30.55 Mean :2.637
## 3rd Qu.:4.151 3rd Qu.:32.63 3rd Qu.:2.853
## Max. :4.377 Max. :38.97 Max. :4.792
##
g = Y %>% ggplot(aes(x=noPurchase, y=revenue, size=avgReviewScore, col=noProd)) +
geom_point(alpha = 0.3) +
geom_text(aes(label = product_type), size=7, check_overlap = TRUE, vjust = -0.7, nudge_y = 0.5) +
geom_hline(aes(yintercept=550000), colour="#990000", linetype="dashed") +
geom_vline(aes(xintercept=5500), colour="#BB0000", linetype="dashed") +
xlim(0,12000) + ylim(0, 1270000) +
labs(title ="Category", x = "num of buy", y = "sales")
g
ggsave(g, file = "Product Type.png",width = 12,height = 9)
## 先合併產品ID
Ord <- left_join(Ord, select(OrdItm[!duplicated(OrdItm$order_id),], order_id, product_id),
by="order_id")
## 再合併商品種類
Ord <- left_join(Ord, select(Prod, product_id, product_type),
by="product_id")
NewProd <- Ord %>% filter(product_type==c("bed_bath_table", "furniture", "health_beauty",
"sports_leisure", "computers_accessories"))
allProdDf <- NewProd %>% group_by(product_type, year, month) %>%
summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allProdDf$year <- as.numeric(allProdDf$year)
allProdDf$month <- as.numeric(allProdDf$month)
allProdDf$date <- as.Date(as.character(allProdDf$year*10000+allProdDf$month*100+01), format = "%Y%m%d")
allProdDf <- allProdDf %>% filter(date>="2017-01-01")
allProdDf <- allProdDf %>% filter(date<="2018-08-01")
ggplot(data = allProdDf) +
geom_smooth(aes(y=value, x=date, col = product_type), method = "lm", se =F, linetype="dashed")
NewProd <- Ord %>% filter(product_type==c("books", "luggage_accessories", "consoles_games", "food", "art"))
allProdDf <- NewProd %>% group_by(product_type, year, month) %>%
summarize(num=n(), value=sum(order_item_value)) %>% na.omit()
allProdDf$year <- as.numeric(allProdDf$year)
allProdDf$month <- as.numeric(allProdDf$month)
allProdDf$date <- as.Date(as.character(allProdDf$year*10000+allProdDf$month*100+01), format = "%Y%m%d")
allProdDf <- allProdDf %>% filter(date>="2017-01-01")
allProdDf <- allProdDf %>% filter(date<="2018-08-01")
ggplot(data = allProdDf) +
geom_smooth(aes(y=value, x=date, col = product_type), method = "lm", se =F, linetype="dashed")
Prod = Prod[order(Prod$revenue, decreasing = T),]
Prod$id = seq(1, nrow(Prod), 1)
g = Prod %>%
filter(revenue > 3000) %>%
ggplot(aes(id, revenue)) +
geom_line(colour="steelblue", size=3) +
## geom_point(colour="navyblue", size=3)
labs(title ="Long-Tail Theory", x = "product", y = "revenue")
g
ggsave(g, file = "2080_Long Tail Theory.png",width = 7,height = 5)
ProductcatRev = group_by(Prod,product_category_name_english) %>%
summarize(totalproductrevenue=sum(revenue))
nrow(ProductcatRev)*0.2
## [1] 14.2
highrevenuecategory = sort(ProductcatRev$totalproductrevenue, decreasing = T)[1:14]
Twentyproduct_category_revenue = sum(highrevenuecategory) # 前20%產品類別收入
Totalproduct_category_revenue = sum(ProductcatRev$totalproductrevenue) # 總產品類別收入
Twentyproduct_category_revenue/Totalproduct_category_revenue # 前20%的產品收益佔全部的76%
## [1] 0.7529475
發現health_beauty,watch gifts,bed_bath_table,sports_leisure,computers_accessories是這個平台主力的商品
# pareto_revenue = ProductcatRev$totalproductrevenue
# names(pareto_revenue) = ProductcatRev$product_category_name_english
# pareto.chart(pareto_revenue,
# ylab = "Revenue",
# main = " Pareto Chart",
# cumperc = c(0,80,100))
OR = OrdRev %>% left_join(., Ord, by = "order_id")
## 移除重複的資料
OR = OR[!duplicated(OR),]
OR$review_score %>% table %>% prop.table()
## .
## 1 2 3 4 5
## 0.11858 0.03235 0.08287 0.19200 0.57420
0.19200 + 0.57420
## [1] 0.7662
0.11858 + 0.03235
## [1] 0.15093
OR %>% ggplot(aes(review_score)) + geom_histogram(aes(y=..count..), binwidth=0.5)
有15.093%的交易評分得到1or2顆星
OR$low_score = sapply(OR$review_score, function(x){
ifelse(x<4, TRUE, FALSE)
})
prop.table(table(OR$low_score))
##
## FALSE TRUE
## 0.7662 0.2338
OR = filter(OR, delivery_days>0 , delivery_days<100)
summary(OR$delivery_days)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 10.00 11.92 15.00 99.00
使用t.test檢定
H0: 低評分交易與高評分交易的delivery_days相等
H1: 低評分交易與高評分交易的delivery_days不相等
t.test(OR$delivery_days ~ OR$low_score)
##
## Welch Two Sample t-test
##
## data: OR$delivery_days by OR$low_score
## t = -71.625, df = 23915, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.816994 -6.453831
## sample estimates:
## mean in group FALSE mean in group TRUE
## 10.49402 17.12943
## p-value < 0.05
## 拒絕「高利潤商品的avg_review_score的相等」的虛無假設
OR %>% ggplot(aes(delivery_days, fill = low_score)) +
geom_histogram(aes(y = ..density..),position = "dodge") +
xlim(0, 100)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
mean(OR[OR$low_score==F, "delivery_days"])
## [1] 10.49402
mean(OR[OR$low_score==T, "delivery_days"])
## [1] 17.12943
OR$order_status %>% table
## .
## approved canceled created delivered invoiced processing
## 0 6 0 96802 0 0
## shipped unavailable
## 0 0
g = OR %>% group_by(review_score) %>%
summarise(avgDeliveryDays = mean(delivery_days, na.rm=T), # 平均運送天數
avgAnswerDelay = mean(answer_delay, na.rm=T), # 平均回覆天數
avgItemCount = mean(order_item_count, na.rm=T), # 平均購買商品數
avgItemValue = mean(order_item_value, na.rm=T), # 平均客單價
avgFreightValue = mean(order_freight_value, na.rm=T), # 平均運費
deliveredProportion = mean(order_status=="canceled"), # 棄單率
deliveredProportion = mean(order_status=="delivered"), # 成單率
avgCommentLength = mean(comment_length, na.rm=T) # 平均評論長度
) %>%
ggplot(aes(x=avgDeliveryDays, y=avgFreightValue,
col=avgCommentLength, size=avgItemCount)) +
geom_point() +
geom_path(size = 1, alpha=.2) +
geom_text(aes(label = review_score), check_overlap = TRUE, vjust=-.5, size=6) +
ylim(21,28) +
labs(title ="Score of Review", x = "length of review", y = "freight")
g
ggsave(g, file = "Review Score.png",width = 7,height = 5)
## 由於評論的資料有部分訂單存在重複評分,因此計算其平均
NewOrdRev <- OrdRev %>% group_by(order_id) %>% summarize(review_score = mean(review_score))
## Ord與NewOrdRev合併
Ord <- left_join(Ord, NewOrdRev, by = "order_id")
## 時間長短所對應的平均分數
TimeScore <- Ord %>% group_by(delivery_days) %>%
summarize(score = mean(review_score), num = n())
## 排除掉小於20筆的資料
TimeScore <- TimeScore[TimeScore$num>=20,]
## 畫圖
ggplot(TimeScore, aes(x=delivery_days, y=score)) +
geom_col()
Brazil = map_data("world") %>% filter(region=="Brazil")
brazilPlot = ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")
# Removing some outliers
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
Geo = Geo[Geo$geolocation_lat <= 5.27438888,]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
Geo = Geo[Geo$geolocation_lng >= -73.98283055,]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
Geo = Geo[Geo$geolocation_lat >= -33.75116944,]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
Geo = Geo[Geo$geolocation_lng <= -34.79314722,]
location = Geo %>% group_by(geolocation_zip_code_prefix) %>%
summarise(lat = max(geolocation_lat),
lng = max(geolocation_lng))
COG = OrdPay[!duplicated(OrdPay$order_id),] %>%
right_join(Ord, by = "order_id") %>%
left_join(Cust, by="customer_id") %>%
left_join(location, by=c("customer_zip_code_prefix"="geolocation_zip_code_prefix"))
brazilPlot +
geom_point(data = Geo, aes(x=geolocation_lng, y=geolocation_lat, color=geolocation_state),size=0.2)
g = brazilPlot +
geom_point(data = COG,aes(x=lng,y=lat,color=customer_state),size=0.2)
g
## Warning: Removed 275 rows containing missing values (geom_point).
SOG = OrdItm %>%
left_join(., Seller, by="seller_id") %>%
left_join(.,location, by= c("seller_zip_code_prefix"="geolocation_zip_code_prefix"))
h = brazilPlot +
geom_point(data = SOG,aes(x=lng,y=lat,color=seller_state),size=0.2)
h
## Warning: Removed 253 rows containing missing values (geom_point).
States = COG %>% group_by(customer_state) %>%
summarise(
noCust = n_distinct(customer_id), # 總共有幾個顧客
noOrder = n(), # 總共有幾筆交易
noItem = sum(order_item_count), # 總共買幾個產品
avgItem = mean(order_item_count), # 平均每個交易買幾個產品
totalRevenue = sum(order_item_value), # 總共Revenue
avgRevenue = mean(order_item_value), # 平均每筆交易的Revenue (客單價)
avgFreight = mean(order_freight_value), # 平均每筆交易的freight運費
avgDeliveryDays = mean(delivery_days,na.rm=T), # 平均運送天數
cancelStatusProportion = mean(order_status=="canceled"), # 棄單比率
deliveredStatusProportion = mean(order_status=="delivered"), # 成單比率
payType_boleto = mean(payment_type.x == "boleto"), # boleto付款比率
payType_CreditCard = mean(payment_type.x == "credit_card"), # credit card付款比率
payType_debitCard = mean(payment_type.x == "debit_card"), # debit card付款比率
payType_voucher = mean(payment_type.x == "voucher") # voucher付款比率
)
States = Geo %>% group_by(geolocation_state) %>%
summarise(lng = mean(geolocation_lng),
lat = mean(geolocation_lat)) %>%
right_join(., States, by = c("geolocation_state" = "customer_state"))
summary(States)
## geolocation_state lng lat noCust
## AC : 1 Min. :-68.45 Min. :-29.680 Min. : 46
## AL : 1 1st Qu.:-51.63 1st Qu.:-19.988 1st Qu.: 378
## AM : 1 Median :-47.97 Median :-10.341 Median : 903
## AP : 1 Mean :-47.53 Mean :-12.453 Mean : 3654
## BA : 1 3rd Qu.:-40.03 3rd Qu.: -5.806 3rd Qu.: 2742
## CE : 1 Max. :-35.76 Max. : 2.717 Max. :41375
## (Other):21
## noOrder noItem avgItem totalRevenue
## Min. : 46 Min. : 52.0 Min. :1.080 Min. : 7829
## 1st Qu.: 378 1st Qu.: 414.5 1st Qu.:1.114 1st Qu.: 69618
## Median : 903 Median : 1055.0 Median :1.131 Median : 156454
## Mean : 3654 Mean : 4172.2 Mean :1.132 Mean : 503394
## 3rd Qu.: 2742 3rd Qu.: 3102.5 3rd Qu.:1.147 3rd Qu.: 406977
## Max. :41375 Max. :47449.0 Max. :1.206 Max. :5202955
##
## avgRevenue avgFreight avgDeliveryDays cancelStatusProportion
## Min. :125.8 Min. :17.37 Min. : 8.211 Min. :0.000000
## 1st Qu.:143.5 1st Qu.:24.89 1st Qu.:14.852 1st Qu.:0.001069
## Median :164.8 Median :36.44 Median :18.609 Median :0.003034
## Mean :164.1 Mean :34.39 Mean :18.154 Mean :0.003393
## 3rd Qu.:177.1 3rd Qu.:41.53 3rd Qu.:20.757 3rd Qu.:0.004120
## Max. :216.7 Max. :48.59 Max. :28.829 Max. :0.021739
##
## deliveredStatusProportion payType_boleto payType_CreditCard
## Min. :0.8913 Min. :0.1429 Min. :0.6765
## 1st Qu.:0.9693 1st Qu.:0.1764 1st Qu.:0.7182
## Median :0.9789 Median :0.2068 Median :0.7453
## Mean :0.9744 Mean :0.2126 Mean :0.7496
## 3rd Qu.:0.9839 3rd Qu.:0.2475 3rd Qu.:0.7802
## Max. :0.9887 Max. :0.2941 Max. :0.8367
## NA's :1 NA's :1
## payType_debitCard payType_voucher
## Min. :0.00000 Min. :0.00000
## 1st Qu.:0.01166 1st Qu.:0.02181
## Median :0.01380 Median :0.02713
## Mean :0.01316 Mean :0.02463
## 3rd Qu.:0.01537 3rd Qu.:0.02930
## Max. :0.02469 Max. :0.03854
## NA's :1 NA's :1
SCOG =COG %>% merge(., SOG, by = "order_id")
routes_count <- SCOG %>% group_by(customer_state, seller_state) %>%
summarise(cnt = n(),SumRev=sum(order_item_value))
dim(routes_count)
## [1] 417 4
routes_count %>% head() %>% knitr::kable()
| customer_state | seller_state | cnt | SumRev |
|---|---|---|---|
| AC | BA | 1 | 1200.00 |
| AC | DF | 1 | 199.00 |
| AC | GO | 1 | 98.99 |
| AC | MA | 1 | 66.99 |
| AC | MG | 10 | 2096.50 |
| AC | MS | 2 | 739.60 |
## 表格化賣家與買家地區物流次數
A = table(customer_state=SCOG$customer_state, seller_state=SCOG$seller_state)
## 依州別看出與其他州物流的密切程度
(scale(A)+1) %>% as.data.frame.matrix %>% d3heatmap(F,F,col=colorRamp(c('skyblue','lightyellow','red')) ,scale ='none')
g = brazilPlot +
geom_point(data = States,
aes(x=lng, y=lat, color = cancelStatusProportion, size = noOrder)
)
## 顏色為棄單率;大小為訂單數量
ggplotly(g)
g = brazilPlot +
geom_point(data = States,
aes(x=lng, y=lat, color = avgDeliveryDays, size = avgFreight)
)
## 顏色為平均到貨天數;大小為平均運費
ggplotly(g)
g = ggplot(States, aes(x=avgItem, y=avgRevenue, col=avgDeliveryDays)) +
geom_point(aes(size=noCust)) +
geom_text(aes(label=geolocation_state), size=4, check_overlap=T, nudge_y = 3)
ggplotly(g)